import re
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from lxml import etree


class spider():
    """Best-effort page scraper.

    Fetches a URL and extracts a title and a body using rules supplied in
    ``arg``:

        match_title / match_body               extraction expressions
        match_title_retype / match_body_retype rule type: 1=regex, 2=CSS selector, 3=xpath
        trim_title / trim_body                 newline-separated ``{replace key="PAT"}REPL{/replace}`` rules
        charset (optional)                     fallback decode charset (default ``utf-8``)
    """

    def __init__(self, arg):
        # Fixed UA string; ``fake_useragent`` is imported at module level but
        # deliberately unused here (the commented-out line predates this).
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
        }
        self.session = requests.session()
        self.kwargs = arg
        # Fallback charset used when the page does not declare one.
        if 'charset' not in arg:
            self.charset = 'utf-8'
        else:
            self.charset = arg['charset']

    def get_data(self, url):
        """Fetch *url* and extract title/content.

        Returns:
            (1, data)    on success; data = {'title': ..., 'content': ...}
            (-1, status) on a non-200 HTTP response
            (0, None)    on empty url, failed extraction, or any exception
        """
        if not url:
            return 0, None
        try:
            # Timeout added so a dead host cannot hang the crawl forever.
            res = self.session.get(url, headers=self.headers, timeout=30)
            if res.status_code != 200:
                return -1, res.status_code
            # First decode pass only sniffs the <meta charset=...> declaration.
            try:
                temcont = res.content.decode('utf-8')
            except UnicodeDecodeError:
                temcont = res.content.decode('gb2312', errors='ignore')
            # HTML5 allows <meta charset=utf-8> with no quotes and no '/>',
            # which the previous pattern missed; match loosely instead.
            charsets = re.findall(r'<meta[^>]*?charset=["\']?([\w-]+)', temcont, re.IGNORECASE)
            # Fall back to the configured charset (was hard-coded 'utf-8',
            # leaving self.charset from __init__ unused).
            charset = charsets[0] if charsets else self.charset
            con = res.content.decode(charset, errors='ignore')
            data = {
                'title': self.replace(
                    self.get_formrex(con, self.kwargs['match_title'], self.kwargs['match_title_retype']),
                    self.kwargs['trim_title']),
                'content': self.replace(
                    self.get_formrex(con, self.kwargs['match_body'], self.kwargs['match_body_retype']),
                    self.kwargs['trim_body']),
            }
            # The original crashed here (None.encode) when the body rule did
            # not match and fell through to (0, None); keep that contract
            # explicitly instead of via an exception.
            if data['content'] is None:
                return 0, None
            return 1, data
        except Exception as e:
            print(e)
            return 0, None

    def get_formrex(self, data, rex, type=1):
        """Dispatch extraction of *rex* from *data* by rule type.

        ``type``: 1=regex, 2=CSS selector, 3=xpath; anything else -> None.
        The parameter name shadows the builtin but is kept for backward
        compatibility with keyword callers.
        """
        handlers = {
            1: self.get_formrex_re,
            2: self.get_formrex_class,
            3: self.get_formrex_xpath,
        }
        handler = handlers.get(type)
        return handler(data, rex) if handler else None

    def get_formrex_re(self, content, reurlrex):
        """Return the first regex match (first group if grouped), or None."""
        try:
            matches = re.findall(reurlrex, content)
        except Exception:
            # Bad pattern / non-string input: best-effort contract is None.
            return None
        return matches[0] if matches else None

    def get_formrex_class(self, content, class_):
        """Return the inner HTML of the first element matching CSS selector
        *class_*, or None when nothing matches or parsing fails."""
        try:
            soup = BeautifulSoup(content, 'lxml')
            nodes = soup.select(class_)
            if not nodes:
                return None
            # Serialize children individually so nested tags are preserved.
            return ''.join(map(str, nodes[0].contents))
        except Exception:
            return None

    def get_formrex_xpath(self, content, xpath):
        """Return the concatenated serialized nodes matched by *xpath*,
        or None on any parse/evaluation failure."""
        try:
            html = etree.HTML(content)
            return ''.join(self.retost(node) for node in html.xpath(xpath))
        except Exception:
            return None

    def retost(self, res):
        """Serialize one lxml node to a unicode string.

        BUG FIX: the original signature lacked ``self``, so every bound call
        raised TypeError and xpath extraction silently returned None.
        """
        return etree.tostring(res, encoding='utf-8').decode('utf-8')

    def replace_xpaser(self, strs):
        """Parse one ``{replace key="PATTERN"}REPLACEMENT{/replace}`` rule.

        Returns a ``(pattern, replacement)`` tuple, or None if *strs* does not
        contain a rule.
        """
        found = re.findall(r'{replace.*?key=\"(.*?)\"}(.*?){/replace}', strs)
        return found[0] if found else None

    def replace(self, strs, rex):
        """Apply every {replace} rule in *rex* (one rule per line) to *strs*
        via ``re.sub`` and return the result."""
        for line in rex.splitlines():
            rule = self.replace_xpaser(line)
            if rule:
                strs = re.sub(rule[0], rule[1], strs)
        return strs




